{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#-*-coding:utf8-*-\n",
    "\n",
    "\"\"\"\n",
    "author:YJM\n",
    "\n",
    "date:20190725   \n",
    "\n",
    "util function\n",
    "\n",
    "\"\"\"\n",
    "from __future__ import division #导入精确除法\n",
    "import os\n",
    "import sys\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import operator\n",
    "from scipy.sparse import coo_matrix #引入稀疏矩阵模块\n",
    "from scipy.sparse.linalg import gmres#用于解Ax=b这一类的线性方程\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 通过movies.csv获取电影信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_item_info(input_file):\n",
    "    if not os.path.exists(input_file):\n",
    "        return {}\n",
    "    item_info={}\n",
    "    linenum=0\n",
    "    fp = open(input_file)\n",
    "    for line in fp:\n",
    "        if linenum == 0:\n",
    "            linenum += 1\n",
    "            continue\n",
    "        item = line.strip().split(',')\n",
    "        if len(item)<3:\n",
    "            continue\n",
    "        elif len(item) == 3:\n",
    "            itemid,title,genre = item[0],item[1],item[2]\n",
    "        elif len(item)>3:\n",
    "            itemid = item[0]\n",
    "            genre = item[-1]\n",
    "            title = ','.join(item[1:-1])\n",
    "        item_info[itemid]=[title,genre]\n",
    "    fp.closed\n",
    "    return item_info"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 图算法的数据格式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_graph_from_data(input_file):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        input_file:user item rating file\n",
    "    Return:\n",
    "        a dict:{User A:{itemb:1,itemc:1},itemb:{UserA:1}}\n",
    "    \"\"\"\n",
    "    if not os.path.exists(input_file):\n",
    "        return {}   \n",
    "    graph={}\n",
    "    linenum =0\n",
    "    score_thr=4.0\n",
    "    fp = open(input_file)\n",
    "    for line in fp:\n",
    "        if linenum ==0:\n",
    "            linenum +=1\n",
    "            continue\n",
    "        item = line.strip().split(\",\")\n",
    "        if len(item)<3:\n",
    "            continue\n",
    "        userid,itemid,rating =item[0],\"item_\"+item[1],item[2]\n",
    "        if float(rating)<score_thr:\n",
    "            continue\n",
    "        if userid not in graph:\n",
    "            graph[userid] ={}\n",
    "        graph[userid][itemid]=1\n",
    "        if itemid not in graph:\n",
    "            graph[itemid]={}\n",
    "        graph[itemid][userid] = 1\n",
    "    fp.close()\n",
    "    return graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "graph=get_graph_from_data(\"../data/ratings15000.csv\")\n",
    "# graph"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将图转成M矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def graph_to_m(graph):\n",
    "    \"\"\"\n",
    "    Arges:   \n",
    "        graph:二分图\n",
    "    Return:\n",
    "       a coo_matrix sparse mat M 稀疏矩阵\n",
    "       a list ,total user item point\n",
    "       a dict ,map all the point to row index\n",
    "    \"\"\"\n",
    "    vertex = graph.keys()#所有的顶点\n",
    "    address_dict = {}#记录所有顶点位置的数据结构\n",
    "    total_len = len(vertex)\n",
    "    for index in range(len(vertex)):\n",
    "#  address_dict[vertex[index]]=index      #注意这里是python2和python3不同书写的地方\n",
    "        address_dict.update({list(vertex)[index]:index})#知道了每一行对应哪个顶点\n",
    "    row = []\n",
    "    col = []\n",
    "    data = []\n",
    "    for element_i in graph:\n",
    "        #顶点i到顶点j有没有连通，如果有就是顶点i出度的倒数\n",
    "        weight =round(1/len(graph[element_i]),3)\n",
    "        row_index =  address_dict[element_i] #i顶点对应的列索引\n",
    "        for  element_j in graph[element_i]:\n",
    "            col_index = address_dict[element_j]\n",
    "            row.append(row_index)\n",
    "            col.append(col_index)\n",
    "            data.append(weight)\n",
    "    row = np.array(row)     \n",
    "    col = np.array(col)\n",
    "    data =np.array(data)\n",
    "    m = coo_matrix((data,(row,col)),shape=(total_len,total_len))\n",
    "    return m,vertex,address_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mat_all_point(m_mat,vertex,alpha):\n",
    "    \"\"\"\n",
    "    get E - alpha*m_mat.T\n",
    "    Args:\n",
    "        m_mat:单位阵\n",
    "        vertex:total item and user point M矩阵\n",
    "        alpha:the prob for random walking随机游走的概率 \n",
    "    Return:\n",
    "        a sparse\n",
    "    \"\"\"\n",
    "    total_len =len(vertex)#单位阵和M矩阵行列数相等\n",
    "    row =[]\n",
    "    col =[]\n",
    "    data=[]\n",
    "    for index in range(total_len):\n",
    "        row.append(index)\n",
    "        col.append(index)\n",
    "        data.append(1)\n",
    "    row =np.array(row)\n",
    "    col =np.array(col)\n",
    "    data =np.array(data)\n",
    "    eye_t=  coo_matrix ((data,(row,col)),shape=(total_len,total_len))#构建单位阵结束\n",
    "    #输出格式使用csv格式这样可以使运算变得快速一点，\n",
    "    return eye_t.tocsr() - alpha*m_mat.tocsr().transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "graph=get_graph_from_data(\"../data/log.txt\")\n",
    "m,vertex,address_dict = graph_to_m(graph)\n",
    "# print(mat_all_point(m,vertex,8).todense())\n",
    "# print(address_dict)\n",
    "# print(m.todense())\n",
    "# print(m)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 传统的personalRank算法模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def personal_rank(graph,root,alpha,iter_num,recom_num=10):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        graph:user item graph\n",
    "        root:指定要推荐的用户\n",
    "        alpha：以alpha的概率选择随机游走，以1-alpha的概率回到起点\n",
    "        item_num:迭代轮次\n",
    "        recom_num=10:指定迭代轮次\n",
    "    Return:\n",
    "        \n",
    "    \"\"\"\n",
    "    rank = {}\n",
    "    rank = {point:0 for point in graph}#将除了root顶点以外，其他所有顶点初始化为0\n",
    "    rank[root] = 1#root顶点初始化成1\n",
    "    recom_result={}#输出的数据结构\n",
    "    for iter_index in range(iter_num):\n",
    "        tmp_rank = {}\n",
    "        tmp_rank = {point:0 for point in graph}#该迭代轮次下其余顶点到root顶点的pr值\n",
    "        #如果该顶点不是root顶点,那么所有连接该顶点的顶点的pr值以1/N的概率贡献给这个顶点\n",
    "        for out_point,out_dict in graph.items():\n",
    "            for inner_point,value in graph[out_point].items():\n",
    "#                 如果顶点不是root顶点（公式的上半部分）\n",
    "                tmp_rank[inner_point] +=round(alpha*rank[out_point]/len(out_dict),4)\n",
    "#                公式的下半部分\n",
    "                if inner_point == root:\n",
    "                    tmp_rank[inner_point] +=round(1-alpha,4)\n",
    "#         迭代充分了提前结束迭代\n",
    "        if tmp_rank ==rank:\n",
    "            print(\"out\"+str(iter_index))#查看是否提前结束迭代\n",
    "            break\n",
    "#         如果没有完全迭代充分，就要赋值给rank这个数据结构\n",
    "        rank = tmp_rank\n",
    "    \n",
    "    right_num = 0#定义一个计数器\n",
    "    \n",
    "#     将rank这个结构根据pr值的得分进行排序，并过滤掉User顶点和root顶点已经行为过的item \n",
    "    for zuhe in sorted(rank.items(),key=operator.itemgetter(1),reverse=True):\n",
    "        point,pr_score =zuhe[0],zuhe[1]\n",
    "        if len(point.split('_'))<2:#如果不是item顶点就过滤掉\n",
    "            continue\n",
    "        if point in graph[root]:#如果被root顶点行为过，同样过滤\n",
    "            continue\n",
    "        recom_result[point] = pr_score #结果装载进数据集\n",
    "        right_num += 1\n",
    "        if right_num >recom_num:\n",
    "            break#迭代轮次结束\n",
    "    return recom_result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将personalRank矩阵化算法模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def personal_rank_mat(graph,root,alpha,recom_num=10):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        graph:用户 物品 二分图\n",
    "        root:固定推荐的用户\n",
    "        alpha:随机游走概率\n",
    "        recom_num:推荐物品数目\n",
    "    Return：\n",
    "        a dict,key:itemid,value:pr score\n",
    "    \"\"\"\n",
    "#     m矩阵，m矩阵所有顶点的集合，所有顶点对应的行号\n",
    "    m,vertex,address_dict =graph_to_m(graph)\n",
    "    if root not in address_dict:\n",
    "        return {}\n",
    "    score_dict ={}\n",
    "    recom_dict ={}\n",
    "    mat_all = mat_all_point(m,vertex,alpha)\n",
    "    index = address_dict[root]#root顶点对应的行号，index的目的是为了得到r0矩阵\n",
    "    initial_list =[[0] for row in range(len(vertex))]#初始化r0矩阵\n",
    "    initial_list[index] =[1]\n",
    "    r_zero = np.array(initial_list)\n",
    "    #解线性方程得到一个元组，第一个元素是所有顶点对该root顶点的pr值得分\n",
    "    res =  gmres(mat_all,r_zero,tol=1e-8)[0]\n",
    "\n",
    "    \n",
    "    for index in range(len(res)):\n",
    "        \n",
    "#         point = vertex[index]#这里又是python2和python3不同书写的地方\n",
    "        point =list(vertex)[index]\n",
    "        \n",
    "        address_dict.update({list(vertex)[index]:index})#知道了每一行对应哪个顶点\n",
    "\n",
    "        if len(point.strip().split(\"_\"))<2:#如果不是item顶点就过滤掉\n",
    "            continue\n",
    "        if point in graph[root]:#如果被root顶点行为过，同样过滤\n",
    "            continue\n",
    "        score_dict[point] = round(res[index],3)\n",
    "    for zuhe in sorted(score_dict.items(),key = operator.itemgetter(1),reverse=True):\n",
    "        point,score = zuhe[0],zuhe[1]\n",
    "        recom_dict[point]=score\n",
    "    return recom_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_one_user_recom():\n",
    "    \"\"\"\n",
    "    give one fix user recom result\n",
    "    \"\"\"\n",
    "    user =\"1\"# A\n",
    "    alpha = 0.8\n",
    "#     graph = get_graph_from_data(\"../data/log.txt\")\n",
    "    graph =get_graph_from_data(\"../data/ratings15000.csv\")\n",
    "    iter_num = 100  \n",
    "    recom_result=personal_rank(graph,user,alpha,100)\n",
    "    return recom_result\n",
    "#     item_info = get_item_info(\"../data/movies.csv\")\n",
    "#     将用户感兴趣的物品打印出来分析结果\n",
    "#     for itemid in graph[user]:\n",
    "#         pure_itemid = itemid.split(\"_\")[1]\n",
    "#         print(item_info[pure_itemid])\n",
    "#     print(\"result------------\")    \n",
    "#     for itemid in recom_result:\n",
    "#         pure_itemid = itemid.split(\"_\")[1]\n",
    "#         print(item_info[pure_itemid])\n",
    "#         print(recom_result[itemid])    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_one_user_by_mat():\n",
    "    \"\"\"\n",
    "    give one fix user by mat\n",
    "    \"\"\"\n",
    "    user =\"1\"# A\n",
    "    alpha = 0.8\n",
    "#     graph = get_graph_from_data(\"../data/log.txt\")\n",
    "    graph =get_graph_from_data(\"../data/ratings15000.csv\")\n",
    "    recom_result=personal_rank_mat(graph,user,alpha,100)\n",
    "    return recom_result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对比两种算法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "out25\n",
      "11\n"
     ]
    }
   ],
   "source": [
    "recom_result__base = get_one_user_recom()\n",
    "recom_result__mat = get_one_user_by_mat()\n",
    "num =0\n",
    "for ele in recom_result__base:\n",
    "    if ele in recom_result__mat:\n",
    "        num +=1\n",
    "print(num) #重合度大概四分之三左右"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
